# This script prepares the different oversampled and undersampled training datasets to develop the childhood MRS-only models. 
# These cMRS-only training datasets prepared in this script will have had the following optimisation techniques applied: ADASYN oversampling and/or random undersampling to give 1:1 class balance
# Once the data is prepared, this script needs to be immediately followed by: "Model_development_XXX.txt", where XXX is the name of the different algorithms considered. 
# The data in file "/Childhood_MRS/MRSonly_model_508ID_standardised_training_dataset.csv" is found in IOWBC_MRS_data.xlsx, sheet: "cMRS standardised training"
# The data in files named "/Childhood_MRS/MRSonly_standardised_oversampled_training_dataset_XXX.csv" were developed using the script "Data_preparation_cMRS_oversampling.txt".
# Python version 3.6.8 was used 

# Imports
import os
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.utils import shuffle

# Set working directory
# NOTE(review): "/../../" resolves to the filesystem root ("/") — this looks
# like a redacted/placeholder path; confirm the intended working directory
# before running.
os.chdir("/../../")

#######################
### Import datasets ###
#######################
# Construct both oversampled and undersampled datasets in the following way:
# data_0 = complete case		data_0_U = complete case, undersampled
# data_25_O = 25% oversampled cases		data_25_OU = 25% oversampled cases, undersampled controls to 1:1 class ratio

# All training CSVs live in the same directory.
_DATA_DIR = "/scratch/dk2e18/IoW_Methylation_Data/MRS/MRS_ML_model/Childhood_MRS/"


def _load_training_csv(filename, n_rows=None, drop_index_col=True):
    """Read one training CSV from _DATA_DIR and report its class balance.

    filename       -- file name inside _DATA_DIR.
    n_rows         -- if given, keep only the first n_rows rows (the
                      oversampled CSVs carry trailing rows that are trimmed,
                      as in the original per-file iloc slices).
    drop_index_col -- drop the 'Unnamed: 0' column that pandas writes when a
                      CSV is saved with its index. Not every file has it.
    Returns the loaded (and possibly trimmed) DataFrame.
    """
    df = pd.read_csv(_DATA_DIR + filename, index_col=False)
    if drop_index_col:
        del df['Unnamed: 0']
    if n_rows is not None:
        df = df.iloc[0:n_rows, :]
    print('Original dataset shape %s' % Counter(df.Asthma_10YR))
    return df


def _undersample_controls(df, seed=123):
    """Randomly undersample controls (Asthma_10YR == 0) to a 1:1 class ratio.

    All cases (Asthma_10YR == 1) are kept; an equal number of controls is
    drawn after a reproducible shuffle, and the combined frame is shuffled
    again so cases and controls are interleaved. The control count is derived
    from the case count rather than hard-coded per dataset.
    """
    cases = df.loc[df['Asthma_10YR'] == 1]
    controls = shuffle(df.loc[df['Asthma_10YR'] == 0], random_state=seed)
    controls = controls.iloc[:len(cases), ]
    # pd.concat replaces the deprecated (removed in pandas 2.x) DataFrame.append.
    balanced = pd.concat([cases, controls], ignore_index=True)
    balanced = shuffle(balanced, random_state=seed)
    print('Original dataset shape %s' % Counter(balanced.Asthma_10YR))
    return balanced


# Complete-case training data.
data_0 = _load_training_csv("Childhood_MRSonly_model_508ID_standardised_training_dataset.csv")
# Counter({0: 418, 1: 90})
data_0_U = _undersample_controls(data_0)
# Counter({0: 90, 1: 90})

# ADASYN-oversampled training data at increasing oversampling percentages,
# each followed by its undersampled (1:1) counterpart.
# NOTE(review): the 25% and 50% files are loaded without dropping
# 'Unnamed: 0', exactly as the original script did — confirm those two CSVs
# were saved without an index column.
data_25_O = _load_training_csv("Childhood_MRSonly_standardised_oversampled_training_dataset_25%.csv",
                               n_rows=531, drop_index_col=False)
# Counter({0: 418, 1: 113})
data_25_OU = _undersample_controls(data_25_O)
# Counter({0: 113, 1: 113})

data_50_O = _load_training_csv("Childhood_MRSonly_standardised_oversampled_training_dataset_50%.csv",
                               n_rows=553, drop_index_col=False)
# Counter({0: 418, 1: 135})
data_50_OU = _undersample_controls(data_50_O)
# Counter({0: 135, 1: 135})

data_100_O = _load_training_csv("Childhood_MRSonly_standardised_oversampled_training_dataset_100%.csv",
                                n_rows=598)
# Counter({0: 418, 1: 180})
data_100_OU = _undersample_controls(data_100_O)
# Counter({0: 180, 1: 180})

data_150_O = _load_training_csv("Childhood_MRSonly_standardised_oversampled_training_dataset_150%.csv",
                                n_rows=643)
# Counter({0: 418, 1: 225})
data_150_OU = _undersample_controls(data_150_O)
# Counter({0: 225, 1: 225})

data_200_O = _load_training_csv("Childhood_MRSonly_standardised_oversampled_training_dataset_200%.csv",
                                n_rows=688)
# Counter({0: 418, 1: 270})
data_200_OU = _undersample_controls(data_200_O)
# Counter({0: 270, 1: 270})

data_250_O = _load_training_csv("Childhood_MRSonly_standardised_oversampled_training_dataset_250%.csv",
                                n_rows=733)
# Counter({0: 418, 1: 315})
data_250_OU = _undersample_controls(data_250_O)
# Counter({0: 315, 1: 315})

data_300_O = _load_training_csv("Childhood_MRSonly_standardised_oversampled_training_dataset_300%.csv",
                                n_rows=778)
# Counter({0: 418, 1: 360})
data_300_OU = _undersample_controls(data_300_O)
# Counter({0: 360, 1: 360})

# Assign all training datasets to be considered for model development into data object.
# Order matters: the eight (over)sampled datasets first, then their
# undersampled counterparts, as the model-development scripts index into this list.
data = [
    data_0, data_25_O, data_50_O, data_100_O,
    data_150_O, data_200_O, data_250_O, data_300_O,
    data_0_U, data_25_OU, data_50_OU, data_100_OU,
    data_150_OU, data_200_OU, data_250_OU, data_300_OU,
]

# Set should be indexed according to the number of datasets included in the object data.
# This will be used during model development to loop through each training dataset.
# NOTE(review): 'set' shadows the built-in set(); the name is kept because the
# downstream "Model_development_XXX" scripts reference it. Derived from
# len(data) so it stays in sync if datasets are added or removed.
set = list(range(len(data)))

# Import standardised cMRS test data - data found in IOWBC_MRS_data.xlsx, sheet: "cMRS standardised test set"
test = pd.read_csv("/../Childhood_MRS/MRSonly_model_239ID_standardised_test_dataset.csv", index_col=False)
# Discard the pandas index column written when the CSV was saved.
test = test.drop(columns=['Unnamed: 0'])
# Separate the asthma outcome from the predictor features for model evaluation.
y_test = test['Asthma_10YR']
X_test = test.drop(['Study_ID', 'Asthma_10YR'], axis=1)
